# clean up the RStudio environment
rm(list = ls())
# # Loading packages
library(DataComputing)
library(mosaic)
library(tidyverse)
library(lubridate)
library(ggplot2)
library(ggalt)
library(corrplot)
library(plotly)
library(gridExtra)
Since the most recent available data was from 2019, I’m going to compare and contrast using the data of 2015 and 2019.
# Data sources
df2015 <- read.csv("2015.csv")
df2019 <- read.csv("2019.csv")
# Data inspection
head(df2015, 10)
head(df2019, 10)
summary(df2015)
Country Region Happiness.Rank Happiness.Score Standard.Error
Length:158 Length:158 Min. : 1.00 Min. :2.839 Min. :0.01848
Class :character Class :character 1st Qu.: 40.25 1st Qu.:4.526 1st Qu.:0.03727
Mode :character Mode :character Median : 79.50 Median :5.232 Median :0.04394
Mean : 79.49 Mean :5.376 Mean :0.04788
3rd Qu.:118.75 3rd Qu.:6.244 3rd Qu.:0.05230
Max. :158.00 Max. :7.587 Max. :0.13693
Economy..GDP.per.Capita. Family Health..Life.Expectancy. Freedom
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.5458 1st Qu.:0.8568 1st Qu.:0.4392 1st Qu.:0.3283
Median :0.9102 Median :1.0295 Median :0.6967 Median :0.4355
Mean :0.8461 Mean :0.9910 Mean :0.6303 Mean :0.4286
3rd Qu.:1.1584 3rd Qu.:1.2144 3rd Qu.:0.8110 3rd Qu.:0.5491
Max. :1.6904 Max. :1.4022 Max. :1.0252 Max. :0.6697
Trust..Government.Corruption. Generosity Dystopia.Residual
Min. :0.00000 Min. :0.0000 Min. :0.3286
1st Qu.:0.06168 1st Qu.:0.1506 1st Qu.:1.7594
Median :0.10722 Median :0.2161 Median :2.0954
Mean :0.14342 Mean :0.2373 Mean :2.0990
3rd Qu.:0.18025 3rd Qu.:0.3099 3rd Qu.:2.4624
Max. :0.55191 Max. :0.7959 Max. :3.6021
summary(df2019)
Overall.rank Country.or.region Score GDP.per.capita Social.support
Min. : 1.00 Length:156 Min. :2.853 Min. :0.0000 Min. :0.000
1st Qu.: 39.75 Class :character 1st Qu.:4.545 1st Qu.:0.6028 1st Qu.:1.056
Median : 78.50 Mode :character Median :5.380 Median :0.9600 Median :1.272
Mean : 78.50 Mean :5.407 Mean :0.9051 Mean :1.209
3rd Qu.:117.25 3rd Qu.:6.184 3rd Qu.:1.2325 3rd Qu.:1.452
Max. :156.00 Max. :7.769 Max. :1.6840 Max. :1.624
Healthy.life.expectancy Freedom.to.make.life.choices Generosity Perceptions.of.corruption
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.5477 1st Qu.:0.3080 1st Qu.:0.1087 1st Qu.:0.0470
Median :0.7890 Median :0.4170 Median :0.1775 Median :0.0855
Mean :0.7252 Mean :0.3926 Mean :0.1848 Mean :0.1106
3rd Qu.:0.8818 3rd Qu.:0.5072 3rd Qu.:0.2482 3rd Qu.:0.1412
Max. :1.1410 Max. :0.6310 Max. :0.5660 Max. :0.4530
nrow(df2015)
[1] 158
ncol(df2015)
[1] 12
nrow(df2019)
[1] 156
ncol(df2019)
[1] 9
I noticed that although the number of rows and columns are not the same for df2015 and df2019, there are not many differences, and I’m going to select the features they both have for future analysis.
Features in both table: happiness rank, country/region, happiness score, GDP, family/social support, Healthy life expectancy, freedom, generosity, and perception of corruption.
Year <- c(2015)
Year2015<- data.frame(Year, df2015)
names(Year2015) <- c("Year", "Country", " Region", "Happiness_Rank","Happiness_Score","Standard_Error", "Economy_GDP_per_Capita","Social_Support","Health_Life_Expectancy","Freedom","Trust_Government_Corruption", "Generosity", "Dystopia_Residual")
Year2 <- c(2019)
Year2019 <- data.frame(Year2, df2019)
names(Year2019) <- c("Year", "Happiness_Rank","Country","Happiness_Score","Economy_GDP_per_Capita","Social_Support","Health_Life_Expectancy","Freedom","Generosity","Trust_Government_Corruption")
New2015 <- Year2015 %>%
select("Happiness_Rank","Country","Happiness_Score","Economy_GDP_per_Capita","Social_Support","Health_Life_Expectancy","Freedom","Generosity","Trust_Government_Corruption", "Year")
#convert character to numeric
FinalTable <- rbind(New2015, Year2019, by= "Country")
FinalTable[3:10] <- lapply(FinalTable[3:10], as.numeric)
NAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercion
head(New2015)
head(Year2019)
head(FinalTable)
NA
Overall correlation
data2015 <- FinalTable %>%
filter(Year=="2015")
cormatrix2015 <- cor(data2015[3:9],use="pairwise.complete.obs")
corrplot(cormatrix2015, method = "number",tl.cex = 0.75, title="Correlation between Happiness scores and other attributs 2015",mar=c(0,0,1,0))
data2019 <- FinalTable %>%
filter(Year=="2019")
cormatrix2019 <- cor(data2019[3:9],use="pairwise.complete.obs")
corrplot(cormatrix2019, method = "number",tl.cex = 0.75, title="Correlation between Happiness scores and other attributs 2019",mar=c(0,0,1,0))
data2019
Side-by-side boxplot to show how factors related to happiness score have changed over time:
par(mfrow=c(2,3))
boxplot(Economy_GDP_per_Capita~Year,data=FinalTable)
boxplot(Social_Support~Year,data=FinalTable)
boxplot(Health_Life_Expectancy~Year,data=FinalTable)
boxplot(Freedom~Year,data=FinalTable)
boxplot(Generosity~Year,data=FinalTable)
boxplot(Trust_Government_Corruption~Year,data=FinalTable)
I’m going to list top 10 countries in 2015 and 2019.
#top 10
Top2015 <- data2015 %>% select(Country, Happiness_Score) %>% head(n=10)
Top2019 <- data2019 %>% select(Country, Happiness_Score) %>% head(n=10)
Top2015 <- ggplot(Top2015,aes(x=factor(Country, levels=Country), y=Happiness_Score)) + geom_bar(stat="identity", width = 0.5, fill="darkblue") +labs(title="Top 10 Happiest Countries 2015", x="Country ranked from 1 to 10", y="Score")
Top2019 <- ggplot(Top2019,aes(x=factor(Country, levels=Country), y=Happiness_Score)) +geom_bar(stat="identity", width = 0.5, fill="darkgreen")+labs(title="Top 10 Happiest Countries 2019", x="Country ranked from 1 to 10", y="Score")
grid.arrange(Top2015, Top2019)
Freedom2015 <- plot_ly(data2015, x=~Happiness_Score,
y = ~Freedom,
color = ~Country,
size= ~Happiness_Score)%>%
layout(title="Freedom vs. Happiness Score (2015)",
xaxis= list(title = "Happiness Score"),
yaxis= list(title = "Freedom"))
Freedom2019 <- plot_ly(data2019, x=~Happiness_Score,
y = ~Freedom,
color = ~Country,
size= ~Happiness_Score)%>%
layout(title="Freedom vs. Happiness Score (2019)",
xaxis= list(title = "Happiness Score"),
yaxis= list(title = "Freedom"))
htmltools::tagList(list(Freedom2015))
htmltools::tagList(list(Freedom2019))